*******************
**# A. SITC-HS-SIC
*******************

*****************
**## A.1 HS-SITC 
*****************

* Note: hs92 271000 maps to sitc2 334. In Feenstra's concordance, codes within
* 334 are  3340, 3341, 3342, 3343, 3344, and 3345
* Manually added these to the concordance to replace 334

*Share of an HS accounted for by an SITC
import excel "raw/hs92_sitc2_concordance.xlsx", sheet("Sheet1") firstrow clear

drop if HS92 == ""
	* These are simply excess lines in excel file. Missing values for all vars.
gen sitc2 = substr(S2,1,4)
assert sitc2!=""

gen n = 1
collapse n, by(sitc2 HS92)
egen totn = total(n), by(HS92)
	* totn is the number of HS codes that map to a given 4-digit SITC code
	* Only one hs code has multiple constitutent sitc codes (hs 271000)

gen hs_sitc_weight = 1/totn
	* For given hs, weights sum to 1 across sitc within hs
	* weight<1 only for various sitc codes within hs271000

keep hs_sitc_weight sitc2 HS92
rename HS92 hs92

assert !missing(sitc2)

save "processing/hs_sitc_concordance.dta", replace


****************
**## A.2 SIC-HS 
****************
*Share of an SIC accounted for by an HS
use "raw/hs_sic_naics_imports_89_121_20220627.dta", clear
keep if inrange(year,1992,1995) 

gen hs92 = floor(commodity/10000)
gen n = 1
collapse n, by(hs92 sic)
assert n==1
egen totn = total(n), by(sic)
	*totn is the number of hs codes that map to a given sic code
gen sic_hs_weight = 1/totn
	* For given sic, weights sum to 1 across hs within sic
keep sic hs92 sic_hs_weight
tostring hs92, replace
replace hs92 = "0" + hs92 if length(hs92) == 5
save "processing/sic_hs_concordance.dta", replace
		
	
***********************************
**# B. SITC-HS-SIC for SIC dataset
***********************************

	
*******************************
**## B.1 SITC Rev 2 to HS 1992 
*******************************
* Note: hs92 271000 maps to sitc2 334. In Feenstra's concordance, codes within
* 334 are  3340, 3341, 3342, 3343, 3344, and 3345
* We have manually added these to the concordance to replace 334
import excel "raw/hs92_sitc2_concordance.xlsx", sheet("Sheet1") firstrow clear

drop if HS92 == ""
* These are simply excess lines in excel file. Missing values for all vars.
gen sitc2 = substr(S2,1,4)
assert sitc2!=""

gen n = 1
collapse n, by(sitc2 HS92)
assert n==1
egen totn = total(n), by(sitc2)
	* totn is the number of HS codes that map to a given 4-digit SITC code
gen sitc_hs_weight = 1/totn
	* For given sitc, weights sum to 1 across hs within sitc

keep sitc* HS92
rename HS92 hs92

save "processing/sitc_hs_concordance", replace
	

*******************
**## B.2 HS to SIC 
*******************
use "raw/hs_sic_naics_imports_89_121_20220627.dta", clear
keep if inrange(year,1992,1995) 
gen hs92 = floor(commodity/10000)
gen n = 1
collapse n, by(hs92 sic)
assert n==1
egen totn = total(n), by(hs92)
	* totn is the number of sic codes that map to a given hs code
gen hs_sic_weight = 1/totn
	* For given hs, weights sum to 1 across sic within hs
keep sic hs92 hs_sic_weight
tostring hs92, replace
replace hs92 = "0" + hs92 if length(hs92) == 5
save "processing/hs_sic_concordance", replace

use "raw/hs_sic_naics_imports_89_121_20220627.dta", clear
keep if year == 1989
gen hs = floor(commodity/10000)
gen n = 1
collapse n, by(hs sic)
egen totn = total(n), by(sic)
	* totn is the number of hs codes that map to a given sic code
gen hs_sic_weight = 1/totn
	* For given hs, weights sum to 1 across sic within hs
keep sic hs hs_sic_weight
save "processing/hs_sic_other_tariff_conc", replace


*********************
**## B.3 SITC to SIC 
*********************
use "processing/sitc_hs_concordance", clear
joinby hs92 using "processing/hs_sic_concordance", unmatched(both)
	* m=1: 19 hs92 codes (0.34%) only in sitc->hs concordance
	* m=2: 26 hs92 codes (0.46%) only in hs->sic concordance
keep if _m == 3
drop _m

*We drop some of the HS codes that SITC codes map to, so we need to scale the 
*sitc_hs_weight up accordingly
bys hs92 sitc2: gen obs = _n

gen sitc_hs_scale_weight = sitc_hs_weight if obs == 1
egen scale = total(sitc_hs_scale_weight), by(sitc2)
	* 14 obs with scale != 1
replace sitc_hs_weight = sitc_hs_weight/scale

gen sitc_sic_weight = sitc_hs_weight*hs_sic_weight

collapse (sum) sitc_sic_weight, by(sitc2 sic)
	* For given sitc2, weights sum to 1 across sic within sitc

save "processing/sitc2_sic_concordance", replace

*********************
**## B.4 SIC to SITC
*********************
use "processing/sic_hs_concordance", clear
joinby hs92 using "processing/hs_sitc_concordance", unmatched(both)
	* m=1: 30 hs92 codes (0.34%) only in sic->hs concordance
	* m=2: 19 hs92 codes (0.46%) only in hs->sitc concordance
keep if _m == 3
drop _m

*We drop some of the HS codes that SIC codes map to, so we need to scale the 
*sic_hs_weight up accordingly
bys sic hs92: gen obs = _n

gen sic_hs_scale_weight = sic_hs_weight if obs == 1
egen scale = total(sic_hs_scale_weight), by(sic)
	* 577 obs with scale != 1
replace sic_hs_weight = sic_hs_weight/scale

gen sic_sitc_weight = sic_hs_weight*hs_sitc_weight

collapse (sum) sic_sitc_weight, by(sitc2 sic)
	* For given sic, weights sum to 1 across sitc2 within sic

save "processing/sic_sitc2_concordance", replace


*******************************************
**# C. 1930 census of manufactures to SIC
*******************************************
import excel "raw/sic_census_concordance_raw.xlsx", sheet("Sheet1") firstrow clear
drop if JohnCheck == "drop"
destring JohnCheck, replace
replace JohnCheck = sic87 if concordance_merge == "Matched (3)"
keep Industry JohnCheck
rename JohnCheck sic87
duplicates drop
gen n = 1
*Share of an SIC accounted for by a given Census industry
egen totn = total(n), by(sic87)
gen proportionate_weight = 1/totn
drop n totn

save "processing/sic_census_concordance_1930_clean", replace


***********************
**# D. Foreign tariffs
***********************

*****************
**## D.1 CCCN-HS
*****************
import delimited "raw/cccn_hs_wits.csv", clear
keep hs198892productcode cccnproductcode
rename (hs198892productcode cccnproductcode) (hs cccn)
save "processing/cccn_hs_wits", replace


****************************************
**## D.2 4-digit CSIC to 5-digit US SIC
****************************************
import excel "raw/canada_us_concordance.xlsx", sheet("Sheet1") clear
drop if _n == 1
replace A = trim(A)

gen us_sic = substr(A,1,5)
gen can_sic = substr(A,-4,4)
gen weight = trim(substr(A,-13,5))

*2411 feeds into non-manufacturing in Canada
drop if substr(weight,1,4) == "2411"
drop A
destring weight, replace
destring can_sic, replace

/*From Trefler
For several industries, my SIC codes are fictitious. 
These occur when I aggregate two codes into a single one because 
of series discontinuities.  The aggregated industries are as follows. 
1094 and 1099 become 1098. 
1511 and 1599 become 1598. 
1995 and 1999 become 1998. 
2911 and 2919 become 2918. 
2951 and 2959 become 2958. 
3051 and 3059 become 3058. 
3351 and 3359 become 3358. 
3362 and 3369 become 3368.
*/

replace can_sic = 1098 if can_sic == 1094 | can_sic == 1099
replace can_sic = 1598 if can_sic == 1511 | can_sic == 1599
replace can_sic = 1998 if can_sic == 1995 | can_sic == 1999
replace can_sic = 2918 if can_sic == 2911 | can_sic == 2919
replace can_sic = 2958 if can_sic == 2951 | can_sic == 2959
replace can_sic = 3058 if can_sic == 3051 | can_sic == 3059
replace can_sic = 3358 if can_sic == 3351 | can_sic == 3359
replace can_sic = 3368 if can_sic == 3362 | can_sic == 3369

*Weight is the share of a US SIC accounted for by a CAN SIC
collapse (sum) weight, by(us_sic can_sic)
save "processing/us_sic_condorance", replace


*****************************************
**# E. Concord IPUMS occ1990 & occ1990dd 
*****************************************
* Unfortunately, IPUMS occ1990 is not exactly the same as Census occ1990 which
* is the basis for ADH occ1990dd variable and all of their other occ-based variables

***************
**## E1. Setup 
***************

import excel "raw/occ1990_xwalk.xls", sheet("STATA") firstrow clear
rename OCC1990 occ1990
	* IPUMS occ1990 variable
rename C census1990
	* census 1990 occupation code
save "processing/occ1990_census1990.dta", replace

******************************************
**## E2. Concord occ1900dd to Census 1990
******************************************
* Import Dorn concordance
use "raw/occ1990_occ1990dd.dta", clear

	* Add occupation groups using subfile_occ1990dd_occgroups.do from [A10] on David Dorn's website 
	
	*******************************************************************
	* Occ1990dd Occupation Groups
	*******************************************************************

	* David Dorn, version October 12, 2012

	* The file can be called whenever the data in memory contains the
	* variable occ1990dd.


	********************************************************************
	* Occupation Dummies Level 3: Subgroups of Low-Skill Service Occupations
	********************************************************************

	gen occ3_clean=0            /* housekeeping, cleaning, laundry */
	replace occ3_clean=1 if     (occ1990dd>=405 & occ1990dd<=408)

	gen occ3_protect=0        /* all protective service */
	replace occ3_protect=1 if (occ1990dd>=415 & occ1990dd<=427)

	gen occ3_guard=0          /* supervisors of guards; guards */
	replace occ3_guard=1 if (occ1990dd==415 | (occ1990dd>=425 & occ1990dd<=427))

	gen occ3_food=0           /* food preparation and service occs */
	replace occ3_food=1 if (occ1990dd>=433 & occ1990dd<=444)

	gen occ3_shealth=0        /* health service occs (dental ass., health/nursing aides) */
	replace occ3_shealth=1 if (occ1990dd>=445 & occ1990dd<=447)

	gen occ3_janitor=0          /* building and grounds cleaning and maintenance occs */
	replace occ3_janitor=1 if (occ1990dd>=448 & occ1990dd<=455)

	gen occ3_beauty=0           /* personal appearance occs */
	replace occ3_beauty=1 if (occ1990dd>=457 & occ1990dd<=458)

	gen occ3_recreation=0      /* recreation and hospitality occs */
	replace occ3_recreation=1 if (occ1990dd>=459 & occ1990dd<=467)

	gen occ3_child=0           /* child care workers */
	replace occ3_child=1 if (occ1990dd==468)

	gen occ3_othpers=0        /* misc. personal care and service occs */
	replace occ3_othpers=1 if (occ1990dd>=469 & occ1990dd<=472)


	********************************************************************
	* Occupation Dummies Level 2: 16 Non-Service Occupation Groups
	********************************************************************

	gen occ2_exec=0          /* executive, administrative and managerial occs */
	replace occ2_exec=1 if (occ1990dd>=3 & occ1990dd<=22)

	gen occ2_mgmtrel=0       /* management related occs */
	replace occ2_mgmtrel=1 if (occ1990dd>=23 & occ1990dd<=37)

	gen occ2_prof=0          /* professional specialty occs */
	replace occ2_prof=1 if (occ1990dd>=43 & occ1990dd<=200)

	gen occ2_tech=0          /* technicians and related support occs */
	replace occ2_tech=1 if (occ1990dd>=203 & occ1990dd<=235)

	gen occ2_finsales=0      /* financial sales and related occs */
	replace occ2_finsales=1 if (occ1990dd>=243 & occ1990dd<=258)

	gen occ2_retsales=0      /* retail sales occs */
	replace occ2_retsales=1 if (occ1990dd>=274 & occ1990dd<=283)

	gen occ2_cleric=0        /* administrative support occs */
	replace occ2_cleric=1 if (occ1990dd>=303 & occ1990dd<=389)

	gen occ2_firepol=0       /* fire fighting, police, and correctional insitutions */
	replace occ2_firepol=1 if (occ1990dd>=417 & occ1990dd<=423)

	gen occ2_farmer=0        /* farm operators and managers */
	replace occ2_farmer=1 if (occ1990dd>=473 & occ1990dd<=475)

	gen occ2_otheragr=0      /* other agricultural and related occs */
	replace occ2_otheragr=1 if (occ1990dd>=479 & occ1990dd<=498)

	gen occ2_mechanic=0      /* mechanics and repairers */
	replace occ2_mechanic=1 if (occ1990dd>=503 & occ1990dd<=549)

	gen occ2_constr=0        /* construction trades */
	replace occ2_constr=1 if (occ1990dd>=558 & occ1990dd<=599)

	gen occ2_mining=0        /* extractive occs */
	replace occ2_mining=1 if (occ1990dd>=614 & occ1990dd<=617)

	gen occ2_product=0       /* precision production occs */
	replace occ2_product=1 if (occ1990dd>=628 & occ1990dd<=699)

	gen occ2_operator=0      /* machine operators, assemblers, and inspectors */
	replace occ2_operator=1 if (occ1990dd>=703 & occ1990dd<=799)

	gen occ2_transp=0        /* transportation and material moving occs */
	replace occ2_transp=1 if (occ1990dd>=803 & occ1990dd<=889)


	********************************************************************
	* Occupation Dummies Level 1: 6 Aggregate Occupation Groups
	* as used in Autor and Dorn, "The Growth of Low-Skill Service Jobs
	* and the Polarization of the U.S. Labor Market"
	********************************************************************

	* (1) management/professional/technical/financial sales/public security occs
	gen occ1_managproftech=occ2_exec+occ2_mgmtrel+occ2_prof+occ2_tech+occ2_finsales+occ2_firepol

	* (2) administrative support and retail sales occs
	gen occ1_clericretail=occ2_cleric+occ2_retsales

	* (3) low-skill services
	gen occ1_service=occ3_clean+occ3_guard+occ3_food+occ3_shealth+occ3_janitor+occ3_beauty+occ3_recreation+occ3_child+occ3_othpers

	* (4) precision production and craft occs
	gen occ1_product=occ2_product

	* (5) machine operators, assemblers and inspectors
	gen occ1_operator=occ2_operator

	* (6) transportation/construction/mechanics/mining/agricultural occs
	gen occ1_transmechcraft=occ2_transp+occ2_constr+occ2_mechanic+occ2_mining+occ2_farmer+occ2_otheragr



	* set values to missing for missing/non-civilian occupations *
	foreach var of varlist occ1_* occ2_* occ3_* {
	   quietly replace `var'=. if occ1990dd<4 | occ1990dd>889 | occ1990dd==.
	}

assert occ1_managproftech+occ1_clericretail+occ1_service+occ1_product+occ1_operator+occ1_transmechcraft==1 if occ1990dd<4 | occ1990dd>889 | occ1990dd==.

* Clean up
rename occ census1990	
gen occGroup=1 if occ1_m==1
replace occGroup=2 if occ1_c==1
replace occGroup=3 if occ1_s==1
replace occGroup=4 if occ1_p==1
replace occGroup=5 if occ1_o==1
replace occGroup=6 if occ1_t==1

label define occGroupLbl 1 "Managers, professional, technology, finance, public safety" 2 "Clerical, retail sales" 3 "Low skill services" 4 "Production, craft" 5 "Machine operators, assemblers" 6 "Transport, construction, mechanical, mining, farm"
label values occGroup occGroupLbl

save "processing/occ1990_occ1990dd_withOccGroups_GLL.dta", replace


**********************************************
**## E3. Concord Census 1990 to IPUMS occ1990 
**********************************************
* Concord
use "processing/occ1990_census1990.dta", clear
mmerge census1990 using "processing/occ1990_occ1990dd_withOccGroups_GLL.dta", type(1:1) 
label define match 1 "only in census crosswalk" 2 "only in dorn crosswalk" 3 "in both"
label values _merge match

* Clean up
sort occ1990 Occupationcategorydescription occ1990dd _merge
quietly by occ1990 Occupationcategorydescription occ1990dd _merge:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup

rename Occupation occupationDescription
label variable occ1990 "IPUMS occ1990 variable"
label variable census1990 "Census 1990 occupation variable"
label variable occ1990dd "David Dorn's consistent 1990 occupation code"
label variable _merge "Merging occ1990-census1990 and census1990-occ1990dd crosswalks"


* Analayze extent of not 1:1 concordance  
count if occ1990<900 & missing(occ1990dd)
bys occ1990: gen Ni=_N
bys occ1990: gen ni=_n
count if ni==1 & !missing(occ1990)
	* # of occ1990 codes
bys occ1990dd: gen nd=_n 
count if nd==1 & !missing(occ1990dd)
	* # of occ1990dd codes
list occ1990 occ1990dd if Ni>1
	* 5 occ1990 codes that map into multiple (non-missing) occ1990dd codes. All other occ1990 codes map into a unique occ1990dd code.
	* That is, nearly always, occ1990dd codes are more aggreated than occ1990 codes.
list occ1990dd if _m==2			
	* This is the single occ1990dd code that doesn't map into a occ1990 code (the reason is that the intervening 1990 census code is
	* in the Dorn occ1990dd-census1990 crosswalk but not in the census1990-occ1990 crosswalk)
drop N* n*

drop occ3_* occ2_* occ1_*
label var occG "David Dorn's occupation group variable"
save "analysis/occ1990_occ1990dd_GLL.dta", replace


***************************************
**# F. BEA commodity - SIC concordance
***************************************

********************************
**## F.1 BEA to SIC concordance
********************************
* The BEA I-O Matrix is identified by commodity code. These must be mapped to SIC
* codes. The BEA provides a concordance from commodities to 1972 SIC codes, but
* some of these are only at the 3-digit level. We identify the 4-digit codes within
* a 3-digit category using the list of codes from the NBER

import excel "raw/sic_1972_full", clear
drop B
rename A sic
gen sic3 = floor(sic/10)
save "processing/sic_1972_3_and_4", replace

*We need to calculate the share of a BEA commodity accounted for by SIC categories.
*To do this, we will use SIC72 shipments, again from NBER. This is 1987 value.
use "raw/conc_sic72_sic87.dta", clear
collapse tot72, by(sic72)
save "processing/shipments_sic72", replace

*Calculate share of BEA commodity accounted for by SIC72
import excel "raw/concordance_1977.xlsx", sheet("Sheet1") clear

rename A commodity
rename B sic4

*Some commodities are mapped to 3-digit SIC. Merging to full set of SIC codes to identify
*the 4-digit SIC codes
gen sic3 = sic4 if sic4 < 1000
joinby sic3 using "processing/sic_1972_3_and_4", unmatched(both)
drop if _m == 2
	* m=1: true sic4 codes in bea commodity->sic4 concordance, so keep
	* m=3: sic3 codes that not do nt appear as raw sic3 codes in commodity -> sitc4 concordance
replace sic4 = sic if _m == 3
keep commodity sic4
rename sic sic72

*Weighting SIC categories by shipments
merge m:1 sic72 using "processing/shipments_sic72"
	* m=1: 6 sic inds (2%) without shipments data
	* m=2: 3 sic inds (1%) with shipments data that not in commodity->sic4 concordance
keep if _m == 3
drop _merge

egen tot_com = total(tot72), by(commodity)
gen sic72_share = tot72/tot_com

keep comm sic72 sic72_share

save "processing/commodity_sic72_conc", replace

* The above is the share of a commodity accounted for by a 1972 SIC category. We
* need to map this to the 1987 SIC
joinby sic72 using "raw/conc_sic72_sic87.dta", unmatched(both)
	* m=1: no obs
	* m=2: 3 sic72 codes (0.56%) in sic72->sic87 concordance but not in commodity->sic72 concordance
keep if _m == 3

gen comm_sic87_share = sic72_share*sh7287

collapse (sum) comm_sic87_share, by(commodity sic87)

save "processing/commodity_sic87_conc", replace

**************************************
**#### C.3.1.2 SIC to BEA concordance
**************************************
* The above provides a concordance from commodities to SIC categories. We also 
* need to construct a concordance from SIC categories to commodities -- what share
* of an SIC code maps to a given commodity
import excel "raw/concordance_1977.xlsx", sheet("Sheet1") clear

rename A commodity
rename B sic4

* Some commodities are mapped to 3-digit SIC. Merging to SIC Tariff Codes to identify
* the 4-digit SIC codes
gen sic3 = sic4 if sic4 < 1000
joinby sic3 using "processing/sic_1972_3_and_4", unmatched(both)
	* m=1: true sic4 codes in bea commodity->sic4 concordance, so keep
	* m=3: sic3 codes that not do nt appear as raw sic3 codes in commodity -> sitc4 concordance
drop if _m == 2

replace sic4 = sic if _m == 3

keep commodity sic4
rename sic sic72

*Share of an SIC accounted for by a commodity
gen n = 1
egen totn = total(n), by(sic72)
gen sic_comm_share = 1/totn

keep comm sic72 sic_comm_share

save "processing/sic72_commodity_conc", replace

*We want the share of an SIC87 accounted for by a given commodity.
*This equals the share of an SIC87 accounted for by a given SIC72, multiplied by
*the share of the SIC72 accounted for by the commodity

use "raw/conc_sic72_sic87.dta", clear
replace sh8772 = .5 if sic87 == 3672
keep sic72 sic87 sh8772

joinby sic72 using "processing/sic72_commodity_conc.dta", unmatched(both)
	* m=1: 3 sic inds (1%) with shipments data that not in commodity->sic4 concordance
	* m=2: 6 sic inds (2%) without shipments data
keep if _m == 3

gen sic87_comm_share = sh8772*sic_comm_share

collapse (sum) sic87_comm_share, by(sic87 commodity)

save "processing/sic87_commodity_conc", replace 

